capture program drop textreg_predict
program textreg_predict
	* Apply a previously saved (pickled) text model to a Stata variable and
	* store the predictions in a new variable.
	*
	* Syntax:
	*   textreg_predict varname using filename [, name_new_var(string) stem stem_lang(string)]
	*
	*   varname        variable holding the text to predict on
	*   using          path of the pickle file that is handed to Python
	*   name_new_var   name of the variable to create (default: predict_)
	*   stem           if specified, the text is stemmed before prediction
	*   stem_lang      language passed to the stemmer (default: english)

	* The -python:- call below requires Stata 16 or newer.
	version 16

	syntax varname using/ [, Name_new_var(string) Stem  stem_Lang(string) ]

	* Fill in defaults for options that were not supplied.
	if `"`name_new_var'"' == "" {
		local name_new_var "predict_"
	}

	if `"`stem_lang'"' == "" {
		local stem_lang "english"
	}

	* Fail early, with an ordinary Stata error, if the model file is missing
	* or the output variable already exists (or is not a valid new name),
	* instead of failing inside Python after the data have been read.
	confirm file `"`using'"'
	confirm new variable `name_new_var'

	* model_path is passed as a Python raw string (r"...") so that backslashes
	* in Windows paths (e.g. C:\temp\new.pkl) are not read as escape sequences
	* such as \t or \n.
	python: textreg_predict(varname="`varlist'" , model_path=r"`using'", name_new_var="`name_new_var'", stem="`stem'",  stem_lang="`stem_lang'")

end



python
# load necessary python packages
from sfi import Data
import pandas as pd
from nltk.stem import SnowballStemmer as sns
import pickle


    

def textreg_predict(varname, model_path, name_new_var, stem, stem_lang):
    """Predict from a pickled model on a Stata text variable and store the result.

    The values of ``varname`` are read from the Stata dataset, two objects are
    unpickled from ``model_path``, the text is optionally stemmed, and the
    prediction is written to a newly created double variable.

    Parameters
    ----------
    varname : str
        Name of the Stata variable to read (passed to ``Data.get``).
    model_path : str
        Path of a pickle file containing two consecutive objects: the first
        must provide ``transform``, the second ``predict``.
    name_new_var : str
        Name of the Stata variable that is created to hold the prediction.
    stem : object
        Any truthy value turns stemming on. The Stata wrapper in this file
        passes the string ``"stem"`` or ``""``.
    stem_lang : str
        Language name handed to the Snowball stemmer; only used when ``stem``
        is truthy.
    """
    print("Loading Data from Stata")
    texts = pd.Series(Data.get(varname))

    print("Loading Model: {}".format(model_path) )
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only point this at model files from a trusted source.
    with open(model_path, 'rb') as handle:
        # The file holds two pickles back to back; the read order matters.
        transformer = pickle.load(handle)
        estimator = pickle.load(handle)

    if stem:
        # Build the stemmer for the requested language.
        stemmer = sns(stem_lang)

        def _stem_text(text):
            """Lower-case ``text``, split on whitespace and stem every word."""
            return " ".join(map(stemmer.stem, text.lower().split()))

        print("Stemming text")
        texts = texts.apply(_stem_text)

    features = transformer.transform(texts)
    prediction = estimator.predict(features)

    # Pass the prediction back to Stata as a new double variable.
    Data.addVarDouble(name_new_var)
    Data.store(var=name_new_var, val=prediction, obs=None)

end











